In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import train_test_split
from textblob import TextBlob,Word
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd


C:\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [8]:
train = pd.read_csv('files/restaurant_review1.csv', encoding = "ISO-8859-1")

In [9]:
train=train.dropna(axis=0)

In [10]:
def stemming_tokenizer(text):
    stopwords = []
    with open('files/new_sw.txt','r+') as f:
        stopword = f.readlines()
        for sw in stopword:
            sw = sw[:-1]
            stopwords.append(sw)
    f.close()
    text = text.lower()
    words = TextBlob(text).correct().words
    words = [Word(w).lemmatize("v") for w in words if not w in stopwords]
    return words

Classifier for food


In [31]:
food_classifier = Pipeline([
        ('vectorizer', CountVectorizer(tokenizer=stemming_tokenizer,ngram_range=(1,3))),
        ('tfidf', TfidfTransformer()),
        ('classifier', KNeighborsClassifier(n_neighbors=5)),
        ])

In [36]:
X_train, X_test, y_train, y_test = train_test_split(train['review'],train['Food'], test_size=0.25, random_state=33)

In [37]:
classifier=food_classifier.fit(X_train, y_train)
print("Accuracy: %s" % classifier.score(X_test, y_test))


Accuracy: 0.697674418605

Classifier for service


In [38]:
service_classifier = Pipeline([
        ('vectorizer', CountVectorizer(tokenizer=stemming_tokenizer,ngram_range=(1,3))),
        ('tfidf', TfidfTransformer()),
        ('classifier', KNeighborsClassifier(n_neighbors=5)),
        ])

X_train, X_test, y_train, y_test = train_test_split(train['review'],train['Service'], test_size=0.25, random_state=33)

classifier=service_classifier.fit(X_train, y_train)
print("Accuracy: %s" % classifier.score(X_test, y_test))


Accuracy: 0.697674418605

Classifier for ambience


In [39]:
amb_classifier = Pipeline([
        ('vectorizer', CountVectorizer(tokenizer=stemming_tokenizer,ngram_range=(1,3))),
        ('tfidf', TfidfTransformer()),
        ('classifier', KNeighborsClassifier(n_neighbors=5)),
        ])

X_train, X_test, y_train, y_test = train_test_split(train['review'],train['Ambience'], test_size=0.25, random_state=33)

classifier=amb_classifier.fit(X_train, y_train)
print("Accuracy: %s" % classifier.score(X_test, y_test))


Accuracy: 0.720930232558

Classifier for deals


In [40]:
deal_classifier = Pipeline([
        ('vectorizer', CountVectorizer(tokenizer=stemming_tokenizer,ngram_range=(1,3))),
        ('tfidf', TfidfTransformer()),
        ('classifier', KNeighborsClassifier(n_neighbors=5)),
        ])

X_train, X_test, y_train, y_test = train_test_split(train['review'],train['deal'], test_size=0.25, random_state=33)

classifier=deal_classifier.fit(X_train, y_train)

print("Accuracy: %s" % classifier.score(X_test, y_test))


Accuracy: 0.860465116279

get reviews from a restaurant with id = res_id


In [44]:
df = pd.read_csv('files/reviews.csv', encoding = "ISO-8859-1")

In [119]:
df.head()


Out[119]:
funny rating user_id review restaurant_id review_id date cool useful
0 0 5 24538 My wife took me here on my birthday for breakf... 3010 1 2011-01-26 2 5
1 0 5 40413 I have no idea why some people give bad review... 1191 2 2011-07-27 0 0
2 0 4 36383 love the gyro plate. Rice is so good and I als... 1989 3 2012-06-14 0 1
3 1 4 25790 Quiessence is, simply put, beautiful. Full wi... 3566 4 2007-12-13 4 3
4 4 5 16256 Drop what you're doing and drive here. After I... 1019 5 2010-02-12 7 7

In [120]:
def getReview(res_id):
    reviews = []
    
    rev = df[(df['restaurant_id']==res_id) & (df['useful']>0)]['review']
    for r in rev:
        reviews.append(r)
    return reviews

In [46]:
rev = getReview(3)

In [53]:
pdf= pd.read_csv('files/restaurants.csv',encoding = "ISO-8859-1")

In [56]:
pred= pdf[['new_id']]

In [63]:
pred.columns=['restaurant_id']

In [64]:
pred['food']=-1


C:\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [65]:
pred['service']=-1
pred['ambience']=-1
pred['deals']=-1


C:\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
C:\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
C:\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()

In [3]:
import pickle

In [136]:
def save_obj(obj, name ):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [144]:
save_obj(food_classifier, 'food_classifier')
save_obj(service_classifier, 'service_classifier')
save_obj(amb_classifier, 'amb_classifier')
save_obj(deal_classifier, 'deal_classifier')

In [5]:
def load_obj(name ):
    with open( name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [11]:
f= load_obj('food_classifier')
s= load_obj('service_classifier')
a= load_obj('amb_classifier')
d= load_obj('deal_classifier')

In [25]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [27]:
X_train, X_test, y_train, y_test = train_test_split(train['review'],train['Food'], test_size=0.25, random_state=33)
print(classification_report(y_test,f.predict(X_test)))


             precision    recall  f1-score   support

        0.0       0.75      0.35      0.48        17
        1.0       0.69      0.92      0.79        26

avg / total       0.71      0.70      0.67        43


In [29]:
X_train, X_test, y_train, y_test = train_test_split(train['review'],train['Service'], test_size=0.25, random_state=33)
print(classification_report(y_test,s.predict(X_test)))


             precision    recall  f1-score   support

        0.0       0.76      0.83      0.79        30
        1.0       0.50      0.38      0.43        13

avg / total       0.68      0.70      0.69        43


In [30]:
X_train, X_test, y_train, y_test = train_test_split(train['review'],train['Ambience'], test_size=0.25, random_state=33)
print(classification_report(y_test,a.predict(X_test)))


             precision    recall  f1-score   support

        0.0       0.70      1.00      0.82        28
        1.0       1.00      0.20      0.33        15

avg / total       0.80      0.72      0.65        43


In [31]:
X_train, X_test, y_train, y_test = train_test_split(train['review'],train['deal'], test_size=0.25, random_state=33)
print(classification_report(y_test,d.predict(X_test)))


             precision    recall  f1-score   support

        0.0       0.85      1.00      0.92        35
        1.0       1.00      0.25      0.40         8

avg / total       0.88      0.86      0.82        43


In [132]:
def getPrediction(id):
    rev= getReview(id)
    food_count = 0 
    serv_count = 0
    amb_count = 0
    deal_count = 0

    for review in rev:
        prediction = {}
        prediction["food"] = food_classifier.predict([review])[0]
        prediction["service"] = service_classifier.predict([review])[0]
        prediction["amb"] = amb_classifier.predict([review])[0]
        prediction["deals"] = deal_classifier.predict([review])[0]

        if(prediction["food"]==1):
            food_count += 1
        if(prediction["service"]==1):
            serv_count += 1
        if(prediction["amb"]==1):
            amb_count += 1
        if(prediction["deals"]==1):
            deal_count += 1

    #     print(prediction)

    rev_size = len(rev)
    if(rev_size> 0 ):
        food_per = food_count/rev_size*100
        serv_per = serv_count/rev_size*100
        amb_per = amb_count/rev_size*100
        deal_per = deal_count/rev_size*100
    else:
        food_per= 0 
        serv_per=0
        amb_per=0
        deal_per=0 

    # print(rev_size,food_per,serv_per,amb_per,deal_per)
    return (rev_size,food_per,serv_per,amb_per,deal_per)

In [127]:
for id in pred[pred['food']==-1].restaurant_id:
    i,a,b,c,d= getPrediction(id)
    pred.iloc[id-1,1]= a
    pred.iloc[id-1,2]= b 
    pred.iloc[id-1,3]= c 
    pred.iloc[id-1,4]= d 
    print(i)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-127-afe62aa5d5d9> in <module>()
      1 for id in pred[pred['food']==-1].restaurant_id:
----> 2     i,a,b,c,d= getPrediction(id)
      3     pred.iloc[id-1,1]= a
      4     pred.iloc[id-1,2]= b
      5     pred.iloc[id-1,3]= c

<ipython-input-121-bc461ddd612a> in getPrediction(id)
     10         prediction["food"] = food_classifier.predict([review])[0]
     11         prediction["service"] = service_classifier.predict([review])[0]
---> 12         prediction["amb"] = amb_classifier.predict([review])[0]
     13         prediction["deals"] = deal_classifier.predict([review])[0]
     14 

C:\Anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
     52 
     53         # lambda, but not partial, allows help() to work with update_wrapper
---> 54         out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
     55         # update the docstring of the returned function
     56         update_wrapper(out, self.fn)

C:\Anaconda3\lib\site-packages\sklearn\pipeline.py in predict(self, X)
    324         for name, transform in self.steps[:-1]:
    325             if transform is not None:
--> 326                 Xt = transform.transform(Xt)
    327         return self.steps[-1][-1].predict(Xt)
    328 

C:\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents)
    891 
    892         # use the same matrix-building strategy as fit_transform
--> 893         _, X = self._count_vocab(raw_documents, fixed_vocab=True)
    894         if self.binary:
    895             X.data.fill(1)

C:\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
    760         for doc in raw_documents:
    761             feature_counter = {}
--> 762             for feature in analyze(doc):
    763                 try:
    764                     feature_idx = vocabulary[feature]

C:\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(doc)
    239 
    240             return lambda doc: self._word_ngrams(
--> 241                 tokenize(preprocess(self.decode(doc))), stop_words)
    242 
    243         else:

<ipython-input-30-3182fd224a51> in stemming_tokenizer(text)
      8     f.close()
      9     text = text.lower()
---> 10     words = TextBlob(text).correct().words
     11     words = [Word(w).lemmatize("v") for w in words if not w in stopwords]
     12     return words

C:\Anaconda3\lib\site-packages\textblob\blob.py in correct(self)
    553         tokens = nltk.tokenize.regexp_tokenize(self.raw, "\w+|[^\w\s]|\s")
    554         corrected = (Word(w).correct() for w in tokens)
--> 555         ret = ''.join(corrected)
    556         return self.__class__(ret)
    557 

C:\Anaconda3\lib\site-packages\textblob\blob.py in <genexpr>(.0)
    552         # regex matches: word or punctuation or whitespace
    553         tokens = nltk.tokenize.regexp_tokenize(self.raw, "\w+|[^\w\s]|\s")
--> 554         corrected = (Word(w).correct() for w in tokens)
    555         ret = ''.join(corrected)
    556         return self.__class__(ret)

C:\Anaconda3\lib\site-packages\textblob\blob.py in correct(self)
    125         .. versionadded:: 0.6.0
    126         '''
--> 127         return Word(self.spellcheck()[0][0])
    128 
    129     @cached_property

C:\Anaconda3\lib\site-packages\textblob\blob.py in spellcheck(self)
    117         .. versionadded:: 0.6.0
    118         '''
--> 119         return suggest(self.string)
    120 
    121     def correct(self):

C:\Anaconda3\lib\site-packages\textblob\en\__init__.py in suggest(w)
    121     """ Returns a list of (word, confidence)-tuples of spelling corrections.
    122     """
--> 123     return spelling.suggest(w)
    124 
    125 def polarity(s, **kwargs):

C:\Anaconda3\lib\site-packages\textblob\_text.py in suggest(self, w)
   1396         candidates = self._known([w]) \
   1397                   or self._known(self._edit1(w)) \
-> 1398                   or self._known(self._edit2(w)) \
   1399                   or [w]
   1400         candidates = [(self.get(c, 0.0), c) for c in candidates]

C:\Anaconda3\lib\site-packages\textblob\_text.py in _edit2(self, w)
   1373         # Of all spelling errors, 99% is covered by edit distance 2.
   1374         # Only keep candidates that are actually known words (20% speedup).
-> 1375         return set(e2 for e1 in self._edit1(w) for e2 in self._edit1(e1) if e2 in self)
   1376 
   1377     def _known(self, words=[]):

C:\Anaconda3\lib\site-packages\textblob\_text.py in <genexpr>(.0)
   1373         # Of all spelling errors, 99% is covered by edit distance 2.
   1374         # Only keep candidates that are actually known words (20% speedup).
-> 1375         return set(e2 for e1 in self._edit1(w) for e2 in self._edit1(e1) if e2 in self)
   1376 
   1377     def _known(self, words=[]):

C:\Anaconda3\lib\site-packages\textblob\_text.py in _edit1(self, w)
   1363             [a + b[1:] for a, b in split if b],
   1364             [a + b[1] + b[0] + b[2:] for a, b in split if len(b) > 1],
-> 1365             [a + c + b[1:] for a, b in split for c in Spelling.ALPHA if b],
   1366             [a + c + b[0:] for a, b in split for c in Spelling.ALPHA]
   1367         )

C:\Anaconda3\lib\site-packages\textblob\_text.py in <listcomp>(.0)
   1363             [a + b[1:] for a, b in split if b],
   1364             [a + b[1] + b[0] + b[2:] for a, b in split if len(b) > 1],
-> 1365             [a + c + b[1:] for a, b in split for c in Spelling.ALPHA if b],
   1366             [a + c + b[0:] for a, b in split for c in Spelling.ALPHA]
   1367         )

KeyboardInterrupt: 

In [125]:
rev = getReview(96)

In [126]:
len(rev)


Out[126]:
244

In [131]:
pred.head()


Out[131]:
restaurant_id food service ambience deals
0 1 100.000000 23.076923 0.000000 0.0
1 2 62.500000 25.000000 12.500000 0.0
2 3 100.000000 25.000000 0.000000 0.0
3 4 100.000000 4.000000 6.000000 0.0
4 5 90.909091 31.818182 4.545455 0.0

In [130]:
pred[(pred['food']==-1)]


Out[130]:
restaurant_id food service ambience deals
95 96 -1.0 -1.0 -1.0 -1.0
96 97 -1.0 -1.0 -1.0 -1.0
97 98 -1.0 -1.0 -1.0 -1.0
98 99 -1.0 -1.0 -1.0 -1.0
99 100 -1.0 -1.0 -1.0 -1.0
100 101 -1.0 -1.0 -1.0 -1.0
101 102 -1.0 -1.0 -1.0 -1.0
102 103 -1.0 -1.0 -1.0 -1.0
103 104 -1.0 -1.0 -1.0 -1.0
104 105 -1.0 -1.0 -1.0 -1.0
105 106 -1.0 -1.0 -1.0 -1.0
106 107 -1.0 -1.0 -1.0 -1.0
107 108 -1.0 -1.0 -1.0 -1.0
108 109 -1.0 -1.0 -1.0 -1.0
109 110 -1.0 -1.0 -1.0 -1.0
110 111 -1.0 -1.0 -1.0 -1.0
111 112 -1.0 -1.0 -1.0 -1.0
112 113 -1.0 -1.0 -1.0 -1.0
113 114 -1.0 -1.0 -1.0 -1.0
114 115 -1.0 -1.0 -1.0 -1.0
115 116 -1.0 -1.0 -1.0 -1.0
116 117 -1.0 -1.0 -1.0 -1.0
117 118 -1.0 -1.0 -1.0 -1.0
118 119 -1.0 -1.0 -1.0 -1.0
119 120 -1.0 -1.0 -1.0 -1.0
120 121 -1.0 -1.0 -1.0 -1.0
121 122 -1.0 -1.0 -1.0 -1.0
122 123 -1.0 -1.0 -1.0 -1.0
123 124 -1.0 -1.0 -1.0 -1.0
124 125 -1.0 -1.0 -1.0 -1.0
... ... ... ... ... ...
4473 4474 -1.0 -1.0 -1.0 -1.0
4474 4475 -1.0 -1.0 -1.0 -1.0
4475 4476 -1.0 -1.0 -1.0 -1.0
4476 4477 -1.0 -1.0 -1.0 -1.0
4477 4478 -1.0 -1.0 -1.0 -1.0
4478 4479 -1.0 -1.0 -1.0 -1.0
4479 4480 -1.0 -1.0 -1.0 -1.0
4480 4481 -1.0 -1.0 -1.0 -1.0
4481 4482 -1.0 -1.0 -1.0 -1.0
4482 4483 -1.0 -1.0 -1.0 -1.0
4483 4484 -1.0 -1.0 -1.0 -1.0
4484 4485 -1.0 -1.0 -1.0 -1.0
4485 4486 -1.0 -1.0 -1.0 -1.0
4486 4487 -1.0 -1.0 -1.0 -1.0
4487 4488 -1.0 -1.0 -1.0 -1.0
4488 4489 -1.0 -1.0 -1.0 -1.0
4489 4490 -1.0 -1.0 -1.0 -1.0
4490 4491 -1.0 -1.0 -1.0 -1.0
4491 4492 -1.0 -1.0 -1.0 -1.0
4492 4493 -1.0 -1.0 -1.0 -1.0
4493 4494 -1.0 -1.0 -1.0 -1.0
4494 4495 -1.0 -1.0 -1.0 -1.0
4495 4496 -1.0 -1.0 -1.0 -1.0
4496 4497 -1.0 -1.0 -1.0 -1.0
4497 4498 -1.0 -1.0 -1.0 -1.0
4498 4499 -1.0 -1.0 -1.0 -1.0
4499 4500 -1.0 -1.0 -1.0 -1.0
4500 4501 -1.0 -1.0 -1.0 -1.0
4501 4502 -1.0 -1.0 -1.0 -1.0
4502 4503 -1.0 -1.0 -1.0 -1.0

4408 rows × 5 columns


In [129]:
pred.to_csv("prediction.csv", index=False)

In [149]:
pred[(pred['food']==-1)& (pred['restaurant_id']>=1000)]


Out[149]:
restaurant_id food service ambience deals
999 1000 -1.0 -1.0 -1.0 -1.0
1000 1001 -1.0 -1.0 -1.0 -1.0
1001 1002 -1.0 -1.0 -1.0 -1.0
1002 1003 -1.0 -1.0 -1.0 -1.0
1003 1004 -1.0 -1.0 -1.0 -1.0
1004 1005 -1.0 -1.0 -1.0 -1.0
1005 1006 -1.0 -1.0 -1.0 -1.0
1006 1007 -1.0 -1.0 -1.0 -1.0
1007 1008 -1.0 -1.0 -1.0 -1.0
1008 1009 -1.0 -1.0 -1.0 -1.0
1009 1010 -1.0 -1.0 -1.0 -1.0
1010 1011 -1.0 -1.0 -1.0 -1.0
1011 1012 -1.0 -1.0 -1.0 -1.0
1012 1013 -1.0 -1.0 -1.0 -1.0
1013 1014 -1.0 -1.0 -1.0 -1.0
1014 1015 -1.0 -1.0 -1.0 -1.0
1015 1016 -1.0 -1.0 -1.0 -1.0
1016 1017 -1.0 -1.0 -1.0 -1.0
1017 1018 -1.0 -1.0 -1.0 -1.0
1018 1019 -1.0 -1.0 -1.0 -1.0
1019 1020 -1.0 -1.0 -1.0 -1.0
1020 1021 -1.0 -1.0 -1.0 -1.0
1021 1022 -1.0 -1.0 -1.0 -1.0
1022 1023 -1.0 -1.0 -1.0 -1.0
1023 1024 -1.0 -1.0 -1.0 -1.0
1024 1025 -1.0 -1.0 -1.0 -1.0
1025 1026 -1.0 -1.0 -1.0 -1.0
1026 1027 -1.0 -1.0 -1.0 -1.0
1027 1028 -1.0 -1.0 -1.0 -1.0
1028 1029 -1.0 -1.0 -1.0 -1.0
... ... ... ... ... ...
4473 4474 -1.0 -1.0 -1.0 -1.0
4474 4475 -1.0 -1.0 -1.0 -1.0
4475 4476 -1.0 -1.0 -1.0 -1.0
4476 4477 -1.0 -1.0 -1.0 -1.0
4477 4478 -1.0 -1.0 -1.0 -1.0
4478 4479 -1.0 -1.0 -1.0 -1.0
4479 4480 -1.0 -1.0 -1.0 -1.0
4480 4481 -1.0 -1.0 -1.0 -1.0
4481 4482 -1.0 -1.0 -1.0 -1.0
4482 4483 -1.0 -1.0 -1.0 -1.0
4483 4484 -1.0 -1.0 -1.0 -1.0
4484 4485 -1.0 -1.0 -1.0 -1.0
4485 4486 -1.0 -1.0 -1.0 -1.0
4486 4487 -1.0 -1.0 -1.0 -1.0
4487 4488 -1.0 -1.0 -1.0 -1.0
4488 4489 -1.0 -1.0 -1.0 -1.0
4489 4490 -1.0 -1.0 -1.0 -1.0
4490 4491 -1.0 -1.0 -1.0 -1.0
4491 4492 -1.0 -1.0 -1.0 -1.0
4492 4493 -1.0 -1.0 -1.0 -1.0
4493 4494 -1.0 -1.0 -1.0 -1.0
4494 4495 -1.0 -1.0 -1.0 -1.0
4495 4496 -1.0 -1.0 -1.0 -1.0
4496 4497 -1.0 -1.0 -1.0 -1.0
4497 4498 -1.0 -1.0 -1.0 -1.0
4498 4499 -1.0 -1.0 -1.0 -1.0
4499 4500 -1.0 -1.0 -1.0 -1.0
4500 4501 -1.0 -1.0 -1.0 -1.0
4501 4502 -1.0 -1.0 -1.0 -1.0
4502 4503 -1.0 -1.0 -1.0 -1.0

3504 rows × 5 columns


In [ ]:


In [ ]: